#!/usr/bin/env python
# coding: utf-8

# In[1]:


exec(open('init_path.py').read())
exec((P_Lib/'GasStation.py').read_text())
get_ipython().run_line_magic('matplotlib', 'inline')


# ### Gas Stataion - Identify Rival

# In[2]:


# Gas Station Data
# GS = read_hdf(f_GS, 'GS').head(1000)
GS = read_hdf(f_GS, 'GS')
GS = GS[GS.Lat.notnull()]
GS.iloc[:2]


# #### Calculate Distance Across All Gas Stations

# In[3]:


# For all pairs of GS, calculate their distance
dict_GS_Lat = GS.set_index('StID').Lat.to_dict()
dict_GS_Long = GS.set_index('StID').Lng.to_dict()

ls = list(itertools.combinations(GS.StID.tolist(), 2)) # All combinations of two values in GS
df = DataFrame(np.zeros(len(ls)), columns=['GSPair'])
df['GSPair'] = ls

df['O'] = df.GSPair.str[0]
df['D'] = df.GSPair.str[1]
df['OLat'] = df.O.map(dict_GS_Lat)
df['OLong'] = df.O.map(dict_GS_Long)
df['DLat'] = df.D.map(dict_GS_Lat)
df['DLong'] = df.D.map(dict_GS_Long)

df.head(2)


# In[4]:


from geog import distance
df['Direct'] = distance(df[['OLong','OLat']].values, df[['DLong','DLat']].values)
df.head(2)


# In[5]:


df[['O','D','Direct']].to_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS', mode='w', complevel=9, complib='blosc')
# Dist = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
# Dist.head(2)


# ### Identify Rival

# In[2]:
df = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
df.head(2)

# In[4]:


df.shape
#df = df.loc[df.Direct<=20000, ['O','D','Direct']]

df_flip = df.rename(columns={'O':'D','D':'O'})
df = df.append(df_flip, sort=False)
df.shape


# In[4]:


# Take the nearest GS as Rival
df = df.sort_values(by=['O','Direct'])
Rival = df.groupby('O')[['D','Direct']].first().reset_index()
Rival.head(2)

#Rival.to_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS', mode='w', complevel=9, complib='blosc')
Rival.to_stata(P_GS_Data / 'GS'/ 'nearest_neighbor.dta',write_index=False)

df = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
df.head(2)


# In[3]:


# df.Direct
df[df.Direct<120_000].shape


# In[5]:


16257554*0.004


# In[6]:


# df.Direct
df[df.Direct<60_000].shape


# In[7]:


5209618*0.004


# In[4]:


df.shape


# In[3]:


# Only focus on pairs with distance <= 1KM
# df = df.loc[df.Direct<=10000, ['O','D','Direct']]
df = df.loc[df.Direct<=1000, ['O','D','Direct']]
#df = df.loc[df.Direct<=2000, ['O','D','Direct']]
df_flip = df.rename(columns={'O':'D','D':'O'})
df = df.append(df_flip, sort=False)
df.shape


# In[4]:


# Take the nearest GS as Rival
df = df.sort_values(by=['O','Direct'])
Rival = df.groupby('O')[['D','Direct']].first().reset_index()
Rival.head(2)


# In[5]:


side_by_side(Rival[Rival.O==3], Rival[Rival.O==7742])


# In[6]:



Rival.to_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS', mode='w', complevel=9, complib='blosc')


# In[7]:


# Rival = read_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS')
# Rival.head(2)


# In[8]:


dict_stid_to_rival = Rival.set_index('O').D.to_dict()
save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival.pkl')
# save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival_2km.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival.pkl')


# ### Rival: DiffBrand

# In[9]:


df = read_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS')
#df = read_hdf(P_GS_Data / 'GS' / 'rival_2km.h5', 'GS')
df.head(2)


# In[10]:


dict_stid_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')
df['BrandIDO'] = df.O.map(dict_stid_to_brandid)
df['BrandIDD'] = df.D.map(dict_stid_to_brandid)
df = df.loc[df.BrandIDO!=df.BrandIDD, ['O','D','Direct']].copy()
df.head(2)


# In[11]:


df.to_hdf(P_GS_Data / 'GS' / 'rival_DiffBrand_perm.h5', 'GS', mode='w', complevel=9, complib='blosc')
#df.to_hdf(P_GS_Data / 'GS' / 'rival_DiffBrand_perm_2km.h5', 'GS', mode='w', complevel=9, complib='blosc')
# df = read_hdf(P_GS_Data / 'GS' / 'rival_DiffBrand_perm.h5', 'GS'); df.head(2)
dict_stid_to_rival = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival_DiffBrand_perm.pkl')
#save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival_DiffBrand_perm_2km.pkl')


# In[14]:


# In[15]:


df.to_stata(P_GS_Data / 'GS' / 'rival_DiffBrand_perm.dta',)


# ### Rival: Mutual_Samebrand

# In[11]:


df = read_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS')
df.head(2)


# In[12]:


# Mutual: same pair that shows twice
df['ODPair'] = df[['O','D']].min(axis=1).astype(str) + '-' + df[['O','D']].max(axis=1).astype(str)
s = df.ODPair.value_counts()
ls_odpair_rival_mutual = s[s==2].index.tolist()
df = df[df.ODPair.isin(ls_odpair_rival_mutual)]
df.head(2)


# In[13]:


# Samebrand: O & D with SAME brands ONLY
dict_stid_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')
df['BrandIDO'] = df.O.map(dict_stid_to_brandid)
df['BrandIDD'] = df.D.map(dict_stid_to_brandid)
df = df.loc[df.BrandIDO==df.BrandIDD, ['O','D','Direct']].copy()
df.head(2)


# In[14]:


side_by_side(df[df.O==5], df[df.D==5])


# In[15]:


df.to_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Samebrand_perm.h5', 'GS', mode='w', complevel=9, complib='blosc')
# df = read_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Samebrand_perm.h5', 'GS'); df.head(2)
dict_stid_to_rival = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Samebrand_perm.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Samebrand_perm.pkl')


# In[16]:


# Combination version: remove same pairs
df['ODPair'] = df[['O','D']].min(axis=1).astype(str) + '-' + df[['O','D']].max(axis=1).astype(str)
df = df.sort_values('O').drop_duplicates('ODPair', keep='first')
df.to_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Samebrand_comb.h5', 'GS', mode='w', complevel=9, complib='blosc')
# df = read_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Samebrand_comb.h5', 'GS'); df.head(2)
dict_stid_to_rival = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_rival, P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Samebrand_comb.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Samebrand_comb.pkl')


# In[15]:


side_by_side(df[df.O==5], df[df.D==5])


# ### Rival: Mutual_Samebrand_Isolation

# In[68]:


df = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
df.head(2)


# In[69]:


# extract the list of StID with a differently-branded nearby
df = df.loc[df.Direct<=5000, ['O','D']].copy()
df['OBrand'] = df.O.map(dict_stid_to_brandid)
df['DBrand'] = df.D.map(dict_stid_to_brandid)
df = df.loc[df.OBrand!=df.DBrand, ['O','D']]
df.head(2)


# In[70]:


ls_stid_with_rival_diffbrand_within5k = list(set(df.values.flatten()))


# In[71]:


save_obj(ls_stid_with_rival_diffbrand_within5k, P_GS_Data / 'GS' / 'ls_stid_with_rival_diffbrand_within5k.pkl')
# ls_stid_with_rival_diffbrand_within5k = load_obj(P_GS_Data / 'GS' / 'ls_stid_with_rival_diffbrand_within5k.pkl')


# ### Rival: Diffbrand_Top11brand

# In[3]:


df = read_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS')
df.head(2)


# In[4]:


# Diffbrand_Top11brand: O & D with DIFFERENT brands ONLY
dict_stid_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')
df['BrandIDO'] = df.O.map(dict_stid_to_brandid)
df['BrandIDD'] = df.D.map(dict_stid_to_brandid)
df = df.loc[(df.BrandIDO<=10)&(df.BrandIDD<=10)].copy()
df = df.loc[df.BrandIDO!=df.BrandIDD, ['O','D','Direct']].copy()
df.head(2)


# In[5]:


side_by_side(df[df.O==9], df[df.D==9])


# In[6]:


df.to_hdf(P_GS_Data / 'GS' / 'rival_Diffbrand_Top11brand_perm.h5', 'GS', mode='w', complevel=9, complib='blosc')
# df = read_hdf(P_GS_Data / 'GS' / 'rival_Diffbrand_Top11brand_perm.h5', 'GS'); df.head(2)
dict_stid_to_df = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_df, P_GS_Data / 'GS' / 'dict_stid_to_rival_Diffbrand_Top11brand_perm.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival_Diffbrand_Top11brand_perm.pkl')


# In[ ]:





# ### Rival: Mutual_Diffbrand_Top11brand

# In[2]:


df = read_hdf(P_GS_Data / 'GS' / 'rival.h5', 'GS')
df.head(2)


# In[3]:


# Mutual: same pair that shows twice
df['ODPair'] = df[['O','D']].min(axis=1).astype(str) + '-' + df[['O','D']].max(axis=1).astype(str)
s = df.ODPair.value_counts()
ls_odpair_rival_mutual = s[s==2].index.tolist()
df = df[df.ODPair.isin(ls_odpair_rival_mutual)]
df.head(2)


# In[4]:


# Diffbrand_Top11brand: O & D with DIFFERENT brands ONLY
dict_stid_to_brandid = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_brandid.pkl')
df['BrandIDO'] = df.O.map(dict_stid_to_brandid)
df['BrandIDD'] = df.D.map(dict_stid_to_brandid)
df = df.loc[(df.BrandIDO<=10)&(df.BrandIDD<=10)].copy()
df = df.loc[df.BrandIDO!=df.BrandIDD, ['O','D','Direct']].copy()
df.head(2)


# In[5]:


side_by_side(df[df.O==9], df[df.D==9])


# In[6]:


df.to_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Diffbrand_Top11brand_perm.h5', 'GS', mode='w', complevel=9, complib='blosc')
# df = read_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Diffbrand_Top11brand_perm.h5', 'GS'); df.head(2)
dict_stid_to_df = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_df, P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Diffbrand_Top11brand_perm.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Diffbrand_Top11brand_perm.pkl')


# In[7]:


# Combination version: remove same pairs
df['ODPair'] = df[['O','D']].min(axis=1).astype(str) + '-' + df[['O','D']].max(axis=1).astype(str)
df = df.sort_values('O').drop_duplicates('ODPair', keep='first')
df.to_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Diffbrand_Top11brand_comb.h5', 'GS', mode='w', complevel=9, complib='blosc')


# In[8]:


# df = read_hdf(P_GS_Data / 'GS' / 'rival_Mutual_Diffbrand_Top11brand_comb.h5', 'GS'); df.head(2)
dict_stid_to_df = df.set_index('O').D.to_dict()
save_obj(dict_stid_to_df, P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Diffbrand_Top11brand_comb.pkl')
# dict_stid_to_rival = load_obj(P_GS_Data / 'GS' / 'dict_stid_to_rival_Mutual_Diffbrand_Top11brand_comb.pkl')


# In[9]:


side_by_side(df[df.O==9], df[df.D==9])


# ### Top N Competitors

# In[16]:


Dist = read_hdf(P_GS_Data / 'GS' / 'gas_station_distance.h5', 'GS')
Dist.head(2)


# In[17]:


# Limit to Competitors within 10KM
Dist = Dist[Dist.Direct<=10000]
Dist.head(2)


# In[18]:


# append Reversed Dist (to get all permutations of O&D)
Dist = Dist.append(Dist.rename(columns={'O':'D','D':'O'}), sort=False).sort_values(['O','Direct'])
print(Dist.O.nunique())
print(Dist.head(2))


# In[19]:


# Only Focus on Stations with More Than 30 GS within 10km
nRivals = Dist.groupby('O').D.count()
ls_gs_keep = nRivals[nRivals>=15].index.tolist()
Dist = Dist[Dist.O.isin(ls_gs_keep)]
print(Dist.O.nunique())
print(Dist.head(2))


# In[20]:


# Take Top 30 GS Only
Dist = Dist.groupby('O').head(15)
print(Dist.O.nunique())
print(Dist.head(2))


# In[21]:


# Add Order: Dist.shape[0]==30 * Dist.O.nunique()
Dist['Order'] = Dist.O.nunique() * list(range(15))
Dist.head(2)


# In[22]:


Dist.to_hdf(P_GS_Data / 'GS' / 'rival_top30rivals_perm.h5', 'GS', mode='w', complevel=9, complib='blosc')


# In[ ]:


# Rival = read_hdf(P_GS_Data / 'GS' / 'rival_top30rivals_perm.h5', 'GS'); Rival.head(2)


# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:





# In[ ]:




